import pandas as pd #Data Manipulation
import numpy as np #Data Manipulation
import matplotlib.pyplot as plt #Plotting
import seaborn as sns #Plotting
sns.set(style='white')
from sklearn import preprocessing #Preprocessing
from scipy.stats import skew, boxcox_normmax #Preprocessing
from scipy.special import boxcox1p #Preprocessing
from sklearn.model_selection import train_test_split #Train/Test Split
from sklearn.linear_model import LogisticRegression #Model
from sklearn.metrics import classification_report #Metrics
from sklearn.metrics import confusion_matrix #Metrics
from sklearn.metrics import accuracy_score #Metrics
from sklearn.metrics import roc_auc_score, roc_curve #ROC
from sklearn import model_selection #Cross Validation
from sklearn.feature_selection import RFE, RFECV #Feature Selection
# Load the HR turnover dataset from a local CSV (absolute Windows path —
# adjust when running elsewhere).
hr = pd.read_csv('C:\\Users\\RIA SHARMA\\Desktop\\data\\data\\turnover.csv')
# Notebook-style expression: displays only in a REPL/notebook.
hr.head()
# Shape of the data frame
print('Rows:', hr.shape[0], '| Columns:', hr.shape[1])
# Describe each variable
def df_desc(df):
    """Summarize each column of *df*.

    Returns a DataFrame indexed by column name with:
      dtype       - the column dtype
      NAs         - number of missing values
      Numerical   - non-object column that is not 0/1-valued
      Boolean     - every value is 0 or 1
      Categorical - object dtype
    """
    # A column is "boolean" when every entry equals 0 or 1; this replaces
    # the original count-of-zeros + count-of-ones == len(df) tally.
    is_boolean = df.isin([0, 1]).all()
    desc = pd.DataFrame({'dtype': df.dtypes,
                         'NAs': df.isna().sum(),
                         'Numerical': (df.dtypes != 'object') & ~is_boolean,
                         'Boolean': is_boolean,
                         'Categorical': df.dtypes == 'object',
                         })
    return desc
# Per-column summary of the raw frame.
df_desc(hr)
# Summarize numercial variables
hr.describe()
# Lists values of categorical variables
categories = {'sales': hr['sales'].unique().tolist(),
              'salary': hr['salary'].unique().tolist()}
# Loop body indentation restored (it was lost in the notebook export).
for i in sorted(categories.keys()):
    print(i + ":")
    print(categories[i])
    # Blank line between categories, but not after the last one.
    if i != sorted(categories.keys())[-1]:
        print("\n")
# Rename variable sales
hr = hr.rename(index=str, columns={'sales': 'department'})
# Count occurences of each values in left
hr['left'].value_counts()
# Get the mean of each variable for the different values of left
hr.groupby('left').mean()
# Correlation Matrix
plt.figure(figsize=(12,8))
sns.heatmap(hr.corr(), cmap='RdBu', annot=True)
plt.tight_layout()
# Pair Plot
plot = sns.PairGrid(hr, hue='left', palette=('steelblue', 'crimson'))
plot = plot.map_diag(plt.hist)
plot = plot.map_offdiag(plt.scatter)
plot.add_legend()
plt.tight_layout()
# Salary Levels proportions and turnover rates
print('Salary Levels proportions')
print(hr['salary'].value_counts()/len(hr)*100)
print('\n')
print('Turnover Rate by Salary level')
print(hr.groupby('salary')['left'].mean())
# Departments proportions
hr['department'].value_counts()/len(hr)*100
# Turnover Rate by Department
hr.groupby('department')['left'].mean().sort_values(ascending=False).plot(kind='bar', color='steelblue')
plt.title('Departure Ratio by Department')
plt.xlabel('')
plt.tight_layout()
# Bar Plot
# NOTE(review): sns.distplot is deprecated in recent seaborn — use
# histplot/displot when upgrading.
plt.figure(figsize=(15,5))
sns.distplot(hr.satisfaction_level,
bins = 20,
color = 'steelblue').axes.set_xlim(min(hr.satisfaction_level),max(hr.satisfaction_level))
plt.tight_layout()
# Bar Plot with left values
plt.figure(figsize=(15,5))
sns.countplot(hr['satisfaction_level'],
hue = hr['left'],
palette = ('steelblue', 'crimson'))
plt.tight_layout()
# Bar Plot
plt.figure(figsize=(15,5))
sns.distplot(hr.last_evaluation,
bins = 20,
color = 'steelblue').axes.set_xlim(min(hr.last_evaluation),max(hr.last_evaluation))
plt.tight_layout()
# Bar Plot with left values
plt.figure(figsize=(15,5))
sns.countplot(hr['last_evaluation'],
hue = hr['left'],
palette = ('steelblue', 'crimson'))
plt.tight_layout()
# Bar Plot
plt.figure(figsize=(15,5))
sns.distplot(hr.number_project,
bins = 20,
color = 'steelblue').axes.set_xlim(min(hr.number_project),max(hr.number_project))
plt.tight_layout()
# Bar Plot with left values
plt.figure(figsize=(15,5))
sns.countplot(hr['number_project'],
hue = hr['left'],
palette = ('steelblue', 'crimson'))
plt.tight_layout()
# Bar Plot
plt.figure(figsize=(15,5))
sns.distplot(hr.average_montly_hours,
bins = 20,
color = 'steelblue').axes.set_xlim(min(hr.average_montly_hours),max(hr.average_montly_hours))
plt.tight_layout()
# Bar Plot with left values
plt.figure(figsize=(15,5))
sns.countplot(hr['average_montly_hours'],
hue = hr['left'],
palette = ('steelblue', 'crimson'))
plt.tight_layout()
# Bar Plot with left values
plt.figure(figsize=(15,5))
sns.countplot(hr['time_spend_company'],
hue = hr['left'],
palette = ('steelblue', 'crimson'))
plt.tight_layout()
# Bar Plot with left values
plt.figure(figsize=(15,5))
sns.countplot(hr['Work_accident'],
hue = hr['left'],
palette = ('steelblue', 'crimson'))
plt.tight_layout()
# Bar Plot with left values
plt.figure(figsize=(15,5))
sns.countplot(hr['promotion_last_5years'],
hue = hr['left'],
palette = ('steelblue', 'crimson'))
plt.tight_layout()
# Compare turnover rate between promoted and non-promoted employees.
print('Turnover Rate if Promotion:', round(len(hr[(hr['promotion_last_5years']==1)&(hr['left']==1)])/len(hr[(hr['promotion_last_5years']==1)])*100,2),'%')
print('Turnover Rate if No Promotion:', round(len(hr[(hr['promotion_last_5years']==0)&(hr['left']==1)])/len(hr[(hr['promotion_last_5years']==0)])*100,2),'%')
# Bar Plot with left values
plt.figure(figsize=(15,5))
sns.barplot(x=hr.average_montly_hours,
y=hr.number_project,
hue=hr.left,
palette = ('steelblue', 'crimson'))
plt.tight_layout()
# Scatter Plot with left values
plt.figure(figsize=(15,5))
sns.scatterplot(x=hr.average_montly_hours,
y=hr.number_project,
hue=hr.left,
palette = ('steelblue', 'crimson'))
plt.tight_layout()
# Bar Plot with left values
plt.figure(figsize=(15,5))
sns.barplot(x=hr.last_evaluation,
y=hr.number_project,
hue=hr.left,
palette = ('steelblue', 'crimson'))
plt.tight_layout()
# Scatter Plot with left values
plt.figure(figsize=(15,5))
sns.scatterplot(x=hr.last_evaluation,
y=hr.number_project,
hue=hr.left,
palette = ('steelblue', 'crimson'))
plt.tight_layout()
# Bar Plot with left values
plt.figure(figsize=(15,5))
sns.barplot(x=hr.average_montly_hours,
y=hr.last_evaluation,
hue=hr.left,
palette = ('steelblue', 'crimson'))
plt.tight_layout()
# Scatter Plot with left values
plt.figure(figsize=(15,5))
sns.scatterplot(x=hr.average_montly_hours,
y=hr.last_evaluation,
hue=hr.left,
palette = ('steelblue', 'crimson'))
plt.tight_layout()
# Bar Plot with left values
plt.figure(figsize=(15,5))
sns.barplot(x=hr.satisfaction_level,
y=hr.last_evaluation,
hue=hr.left,
palette = ('steelblue', 'crimson'))
plt.tight_layout()
# Scatter Plot with left values
plt.figure(figsize=(15,5))
sns.scatterplot(x=hr.satisfaction_level,
y=hr.last_evaluation,
hue=hr.left,
palette = ('steelblue', 'crimson'))
plt.tight_layout()
# Encoding the variable salary
# Ordinal encoding: low < medium < high. NOTE(review): map() leaves NaN
# for any salary value outside this dict — confirm the CSV only holds
# these three levels.
salary_dict = {'low':0,'medium':1,'high':2}
hr['salary_num'] = hr.salary.map(salary_dict)
hr.drop('salary', inplace=True, axis=1)
hr = hr.rename(index=str, columns={'salary_num':'salary'})
hr.head()
def numerical_features(df):
    """Return the names of *df*'s numeric columns (pandas Index)."""
    # The original also bound an unused `columns = df.columns` local.
    return df._get_numeric_data().columns
def categorical_features(df):
    """Return the non-numeric (categorical) column names of *df* as a list.

    Note: the result comes from a set difference, so its order is not
    guaranteed to follow the column order.
    """
    numeric_cols = numerical_features(df)
    return list(set(df.columns) - set(numeric_cols))
def onehot_encode(df):
    """One-hot encode every categorical column of *df*.

    Returns a new DataFrame with the numeric columns first, followed by
    dummy columns (prefixed with the source column name) for each
    categorical feature. *df* itself is not modified.
    """
    numericals = df.get(numerical_features(df))
    # Build all dummy frames first and concatenate once: concatenating
    # inside the loop re-copies the growing frame each iteration (O(n^2)).
    dummies = [pd.get_dummies(df[column], prefix=column)
               for column in categorical_features(df)]
    return pd.concat([numericals.copy()] + dummies, axis=1)
# One-hot encode the remaining categorical column(s) (department).
hr_encoded = onehot_encode(hr)
hr_encoded.head()
df_desc(hr_encoded)
# Distributions of the three continuous features before scaling.
hr_encoded[['satisfaction_level',
'last_evaluation',
'average_montly_hours'
]].hist(bins = 20, figsize = (15,10), color = 'steelblue')
plt.tight_layout()
hr_encoded[['satisfaction_level',
'last_evaluation',
'average_montly_hours'
]].describe()
# Min-max scale the continuous features into [0, 1].
scaler = preprocessing.MinMaxScaler()
hr_scaled_part = scaler.fit_transform(hr_encoded[['satisfaction_level',
'last_evaluation',
'average_montly_hours']])
hr_scaled_part = pd.DataFrame(hr_scaled_part, columns=list(['satisfaction_level',
'last_evaluation',
'average_montly_hours']))
hr_scaled_part[['satisfaction_level',
'last_evaluation',
'average_montly_hours']].hist(bins = 20, figsize = (15,10), color = 'steelblue')
plt.tight_layout()
hr_scaled_part.describe()
def feature_skewness(df):
    """Compute the skew of each numeric column of *df*.

    Returns a tuple ``(feature_skew, numeric_features)`` where
    ``feature_skew`` is a Series of skew values sorted descending and
    ``numeric_features`` is the list of numeric column names (in column
    order).
    """
    numeric_dtypes = ['int16', 'int32', 'int64',
                      'float16', 'float32', 'float64']
    numeric_features = [col for col in df.columns
                        if df[col].dtype in numeric_dtypes]
    feature_skew = df[numeric_features].apply(
        lambda x: skew(x)).sort_values(ascending=False)
    # The original also built an unused `skews` DataFrame here.
    return feature_skew, numeric_features
def fix_skewness(df):
    """Box-Cox transform every numeric column of *df* whose skew > 0.5.

    Columns are transformed **in place** with ``boxcox1p``; the optimal
    lambda is estimated on the shifted data (x + 1), matching the +1
    shift that boxcox1p itself applies. Returns the (mutated) *df*.
    """
    feature_skew, numeric_features = feature_skewness(df)
    high_skew = feature_skew[feature_skew > 0.5]
    for col in high_skew.index:
        df[col] = boxcox1p(df[col], boxcox_normmax(df[col] + 1))
    # The original recomputed the post-transform skews into unused locals;
    # that dead work is removed.
    return df
# De-skew the scaled continuous features (mutates hr_scaled_part in place).
hr_skewed_part = fix_skewness(hr_scaled_part)
hr_skewed_part.hist(bins = 20, figsize = (15,10), color = 'steelblue')
plt.tight_layout()
hr_skewed_part.describe()
# Recombine: encoded frame minus the raw continuous columns, plus their
# scaled/de-skewed versions. Indexes are reset so concat aligns by row order.
hr_simple = hr_encoded.copy()
hr_simple.drop(['satisfaction_level',
'last_evaluation',
'average_montly_hours'], inplace=True, axis=1)
hr_ready = pd.DataFrame()
hr_simple.reset_index(drop=True, inplace=True)
hr_skewed_part.reset_index(drop=True, inplace=True)
hr_ready = pd.concat([hr_skewed_part,hr_simple], axis=1, sort=False, ignore_index=False)
# hr_ready['number_project'] = hr_ready['number_project'].astype('category').cat.codes
# hr_ready['time_spend_company'] = hr_ready['time_spend_company'].astype('category').cat.codes
hr_ready.head()
df_desc(hr_ready)
hr_ready.describe()
hr_ready.hist(bins = 20, figsize = (15,10), color = 'steelblue')
plt.tight_layout()
# Global modelling parameters: target column, test fraction, RNG seed.
target = 'left'
split_ratio = 0.3
seed = 806
def split_dataset(df, target, split_ratio=0.3, seed=806):
    """Separate *df* into features/target, then into train/test splits.

    Returns (X, y, X_train, X_test, y_train, y_test); *y* is kept as a
    one-column DataFrame.
    """
    feature_names = list(df)
    feature_names.remove(target)
    X, y = df[feature_names], df[[target]]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=split_ratio, random_state=seed)
    return X, y, X_train, X_test, y_train, y_test
# Split the prepared dataset and report the resulting shapes.
X, y, X_train, X_test, y_train, y_test = split_dataset(hr_ready, target, split_ratio, seed)
print('Features:',X.shape[0], 'items | ', X.shape[1],'columns')
print('Target:',y.shape[0], 'items | ', y.shape[1],'columns')
print('Features Train:',X_train.shape[0], 'items | ', X_train.shape[1],'columns')
print('Features Test:',X_test.shape[0], 'items | ', X_test.shape[1],'columns')
print('Target Train:',y_train.shape[0], 'items | ', y_train.shape[1],'columns')
print('Target Test:',y_test.shape[0], 'items | ', y_test.shape[1],'columns')
# Baseline model: L2 logistic regression (lbfgs solver).
lr = LogisticRegression(solver='lbfgs', max_iter = 300)
def lr_run(model, X_train, y_train, X_test, y_test):
    """Fit *model* on the training split and print an evaluation summary.

    Prints test accuracy, a classification report, the confusion matrix
    and the fitted coefficients (intercept listed first). Returns None;
    *model* is fitted in place.
    """
    # The original bound the fit result to an unused `result` local.
    model.fit(X_train, y_train.values.ravel())
    y_pred = model.predict(X_test)
    acc_test = model.score(X_test, y_test)
    # Pair each feature name with its fitted coefficient.
    coefficients = pd.concat([pd.DataFrame(X_train.columns, columns=['Feature']),
                              pd.DataFrame(np.transpose(model.coef_), columns=['Coef.'])],
                             axis=1)
    # Prepend the intercept as the first row, then renumber from 0.
    coefficients.loc[-1] = ['intercept.', model.intercept_[0]]
    coefficients.index = coefficients.index + 1
    coefficients = coefficients.sort_index()
    print('Accuracy on test: {:.3f}'.format(acc_test))
    print()
    print(classification_report(y_test, y_pred))
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    print()
    print(coefficients)
# Evaluate the baseline logistic regression on the hold-out split.
lr_run(lr, X_train, y_train, X_test, y_test)
def plot_roc(model, X_test, y_test):
    """Plot the ROC curve of *model* on the test split.

    Both the curve and the AUC shown in the legend are computed from the
    positive-class probabilities. (The original computed the AUC from
    hard ``predict`` labels while the curve used probabilities, which
    understates the AUC and disagrees with the plotted curve.)
    """
    y_score = model.predict_proba(X_test)[:, 1]
    logit_roc_auc = roc_auc_score(y_test, y_score)
    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    plt.figure()
    plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
    plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
    plt.xlim([0.0, 1.05])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve')
    plt.legend(loc="lower right")
    plt.show()
# ROC curve of the baseline model on the hold-out split.
plot_roc(lr, X_test, y_test)
# Cross Validation Strategy
def cv_acc (model, X_train, y_train, n_splits, seed):
    """Report k-fold cross-validated accuracy for *model*.

    Prints the mean accuracy followed by the per-fold accuracies.
    ``shuffle=True`` is required: recent scikit-learn raises a
    ValueError when ``random_state`` is set on an unshuffled KFold.
    """
    kfold = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    scoring = 'accuracy'
    results = model_selection.cross_val_score(model, X_train, y_train.values.ravel(), cv=kfold, scoring=scoring)
    # The message previously hard-coded "10-fold" regardless of n_splits.
    print("%d-fold cross validation average accuracy: %.3f" % (n_splits, results.mean()))
    print()
    for i in range(len(results)):
        print('Iteration', '{:>2}'.format(i+1), '| Accuracy: {:.2f}'.format(results[i]))
# 10-fold cross-validated accuracy of the baseline model.
cv_acc(lr, X_train, y_train, 10, seed)
# The dataset is copied to add or modify features.
# Working copy for feature engineering; hr_ready stays untouched.
hr_fe = hr_ready.copy()
# Based on the EDA, we can bin the Satisfaction Level into 6 bins.
# Bin edges are on the scaled/de-skewed values; the labels show the
# corresponding original-scale ranges — TODO confirm the mapping.
bins = [-1, 0.03, 0.29, 0.41, 0.69, 0.92, 1]
labels=['(0.00, 0.11]','(0.11, 0.35]','(0.35, 0.46]','(0.46, 0.71]','(0.71, 0.92]','(0.92, 1.00]']
hr_fe['satisfaction_level_bin'] = pd.cut(hr_fe.satisfaction_level, bins, labels=labels)
hr_fe.satisfaction_level_bin.value_counts()
plt.figure(figsize=(15,5))
sns.countplot(x=hr_fe.satisfaction_level,
hue=hr_fe.satisfaction_level_bin,
palette = sns.color_palette("hls", 6),
dodge = False)
plt.tight_layout()
# Re-encode with the new bin column and drop the raw feature, then
# evaluate the model on this variant.
hr_fe_1 = hr_fe.copy()
hr_fe_1 = onehot_encode(hr_fe_1)
hr_fe_1.drop('satisfaction_level', inplace=True, axis=1)
X_fe_1, y_fe_1, X_fe_1_train, X_fe_1_test, y_fe_1_train, y_fe_1_test = split_dataset(hr_fe_1, target, split_ratio, seed)
cv_acc(lr, X_fe_1_train, y_fe_1_train, 10, seed)
print()
lr_run(lr, X_fe_1_train, y_fe_1_train, X_fe_1_test, y_fe_1_test)
# Based on the EDA, we can bin the Last Evaluation into 4 bins.
# Bin the (scaled) last_evaluation into 4 ranges; labels show
# original-scale ranges — TODO confirm the mapping.
bins = [-1, 0.14, 0.34, 0.64, 1]
labels=['(0.00, 0.44]','(0.44, 0.57]','(0.57, 0.76]','(0.76, 1.00]']
hr_fe['last_evaluation_bin'] = pd.cut(hr_fe.last_evaluation, bins, labels=labels)
hr_fe_1['last_evaluation_bin'] = pd.cut(hr_fe_1.last_evaluation, bins, labels=labels)
hr_fe_1.last_evaluation_bin.value_counts()
plt.figure(figsize=(15,5))
sns.countplot(x=hr_fe_1.last_evaluation,
hue=hr_fe_1.last_evaluation_bin,
palette = sns.color_palette("hls", 6),
dodge = False)
plt.tight_layout()
# Encode, drop the raw feature, and re-evaluate.
hr_fe_2 = hr_fe_1.copy()
hr_fe_2 = onehot_encode(hr_fe_2)
hr_fe_2.drop('last_evaluation', inplace=True, axis=1)
X_fe_2, y_fe_2, X_fe_2_train, X_fe_2_test, y_fe_2_train, y_fe_2_test = split_dataset(hr_fe_2, target, split_ratio, seed)
cv_acc(lr, X_fe_2_train, y_fe_2_train, 10, seed)
print()
lr_run(lr, X_fe_2_train, y_fe_2_train, X_fe_2_test, y_fe_2_test)
# Based on the EDA, we can bin the Average Monthly Hours into 7 bins.
# Bin the (scaled) average_montly_hours into 7 ranges; labels show
# original-scale hour ranges — TODO confirm the mapping.
bins = [-1, 0.14, 0.165, 0.304, 0.565, 0.840, 0.897, 1]
labels=['(0, 125]','(125, 131]','(131, 161]','(161, 216]','(216, 274]','(274, 287]','(287, 310]']
hr_fe['average_montly_hours_bin'] = pd.cut(hr_fe.average_montly_hours, bins, labels=labels)
hr_fe_2['average_montly_hours_bin'] = pd.cut(hr_fe_2.average_montly_hours, bins, labels=labels)
hr_fe_2.average_montly_hours_bin.value_counts()
plt.figure(figsize=(15,5))
sns.countplot(x=hr_fe_2.average_montly_hours,
hue=hr_fe_2.average_montly_hours_bin,
palette = sns.color_palette("hls", 7),
dodge = False)
plt.tight_layout()
# Encode, drop the raw feature, and re-evaluate.
hr_fe_3 = hr_fe_2.copy()
hr_fe_3 = onehot_encode(hr_fe_3)
hr_fe_3.drop('average_montly_hours', inplace=True, axis=1)
X_fe_3, y_fe_3, X_fe_3_train, X_fe_3_test, y_fe_3_train, y_fe_3_test = split_dataset(hr_fe_3, target, split_ratio, seed)
cv_acc(lr, X_fe_3_train, y_fe_3_train, 10, seed)
print()
lr_run(lr, X_fe_3_train, y_fe_3_train, X_fe_3_test, y_fe_3_test)
# Based on the EDA, the Number of Projects can be categorized into 4 categories.
# Map the project count onto 4 ordinal workload categories.
categ = {2:'too low', 3:'normal', 4:'normal', 5:'normal', 6:'too high', 7:'extreme'}
hr_fe['number_project_cat'] = hr_fe.number_project.map(categ)
hr_fe_3['number_project_cat'] = hr_fe_3.number_project.map(categ)
hr_fe_3.number_project_cat.value_counts()
plt.figure(figsize=(15,5))
sns.countplot(x=hr_fe_3.number_project,
hue=hr_fe_3.number_project_cat,
palette = sns.color_palette("hls", 6),
dodge = False)
plt.tight_layout()
# Encode, drop the raw feature, and re-evaluate.
hr_fe_4 = hr_fe_3.copy()
hr_fe_4 = onehot_encode(hr_fe_4)
hr_fe_4.drop('number_project', inplace=True, axis=1)
X_fe_4, y_fe_4, X_fe_4_train, X_fe_4_test, y_fe_4_train, y_fe_4_test = split_dataset(hr_fe_4, target, split_ratio, seed)
cv_acc(lr, X_fe_4_train, y_fe_4_train, 10, seed)
print()
lr_run(lr, X_fe_4_train, y_fe_4_train, X_fe_4_test, y_fe_4_test)
# Based on the EDA, the Time Spent in Company can be categorized into 4 categories, related to the rate of departure.
# Map tenure (years) onto departure-rate categories.
# NOTE(review): year 9 is absent from this dict, so a 9-year tenure would
# map to NaN — confirm the data holds no such value.
categ = {2:'low departure', 3:'high departure', 4:'high departure', 5:'very high departure', 6:'high departure', 7:'no departure', 8:'no departure', 10:'no departure'}
hr_fe['time_spend_company_cat'] = hr_fe.time_spend_company.map(categ)
hr_fe_4['time_spend_company_cat'] = hr_fe_4.time_spend_company.map(categ)
hr_fe_4.time_spend_company_cat.value_counts()
plt.figure(figsize=(15,5))
sns.countplot(x=hr_fe_4.time_spend_company,
hue=hr_fe_4.time_spend_company_cat,
palette = sns.color_palette("hls", 7),
dodge = False)
plt.tight_layout()
# Encode, drop the raw feature, and re-evaluate.
hr_fe_5 = hr_fe_4.copy()
hr_fe_5 = onehot_encode(hr_fe_5)
hr_fe_5.drop('time_spend_company', inplace=True, axis=1)
X_fe_5, y_fe_5, X_fe_5_train, X_fe_5_test, y_fe_5_train, y_fe_5_test = split_dataset(hr_fe_5, target, split_ratio, seed)
cv_acc(lr, X_fe_5_train, y_fe_5_train, 10, seed)
print()
lr_run(lr, X_fe_5_train, y_fe_5_train, X_fe_5_test, y_fe_5_test)
# Based on the EDA, the employees can be clustered by Workload, based on the Number of Projects and Average Monthly Hours, into 5 categories.
def workload_cluster(row):
    """Label a row's workload from its monthly-hours bin and project count.

    Returns one of 'very low', 'low', 'high', 'extreme' or 'normal'.
    """
    hours_bin = row['average_montly_hours_bin']
    projects = row['number_project']
    if hours_bin == '(0, 125]':
        return 'very low'
    if projects <= 2 and hours_bin in ('(125, 131]', '(131, 161]'):
        return 'low'
    if projects >= 4 and hours_bin in ('(216, 274]', '(274, 287]'):
        return 'high'
    if hours_bin == '(287, 310]':
        return 'extreme'
    return 'normal'
# Derive the workload cluster per row and inspect its distribution.
hr_fe['workload'] = hr_fe.apply(lambda row: workload_cluster(row), axis=1)
hr_fe.workload.value_counts()
plt.figure(figsize=(15,5))
sns.scatterplot(x=hr_fe.average_montly_hours,
y=hr_fe.number_project,
hue=hr_fe.workload,
palette = sns.color_palette("hls", 5))
plt.tight_layout()
# Encode with the engineered features, drop all raw source columns, and
# re-evaluate the model.
hr_fe_6 = hr_fe.copy()
hr_fe_6 = onehot_encode(hr_fe_6)
hr_fe_6.drop('satisfaction_level', inplace=True, axis=1)
hr_fe_6.drop('last_evaluation', inplace=True, axis=1)
hr_fe_6.drop('average_montly_hours', inplace=True, axis=1)
hr_fe_6.drop('number_project', inplace=True, axis=1)
hr_fe_6.drop('time_spend_company', inplace=True, axis=1)
X_fe_6, y_fe_6, X_fe_6_train, X_fe_6_test, y_fe_6_train, y_fe_6_test = split_dataset(hr_fe_6, target, split_ratio, seed)
cv_acc(lr, X_fe_6_train, y_fe_6_train, 10, seed)
print()
lr_run(lr, X_fe_6_train, y_fe_6_train, X_fe_6_test, y_fe_6_test)
# Based on the EDA, the employees can be clustered by Project Performance, based on the Number of Projects and Last Evaluation, into 4 categories.
def project_performance_cluster(row):
    """Label a row's project performance from its evaluation bin and
    project count: 'very low', 'low', 'high' or 'normal'."""
    evaluation_bin = row['last_evaluation_bin']
    projects = row['number_project']
    if evaluation_bin == '(0.00, 0.44]':
        return 'very low'
    if projects <= 2 and evaluation_bin == '(0.44, 0.57]':
        return 'low'
    if projects >= 4 and evaluation_bin == '(0.76, 1.00]':
        return 'high'
    return 'normal'
# Derive the project-performance cluster per row and inspect it.
hr_fe['project_performance'] = hr_fe.apply(lambda row: project_performance_cluster(row), axis=1)
hr_fe.project_performance.value_counts()
plt.figure(figsize=(15,5))
sns.scatterplot(x=hr_fe.last_evaluation,
y=hr_fe.number_project,
hue=hr_fe.project_performance,
palette = sns.color_palette("hls", 4))
plt.tight_layout()
# Encode, drop the raw source columns, and re-evaluate.
hr_fe_7 = hr_fe.copy()
hr_fe_7 = onehot_encode(hr_fe_7)
hr_fe_7.drop('satisfaction_level', inplace=True, axis=1)
hr_fe_7.drop('last_evaluation', inplace=True, axis=1)
hr_fe_7.drop('average_montly_hours', inplace=True, axis=1)
hr_fe_7.drop('number_project', inplace=True, axis=1)
hr_fe_7.drop('time_spend_company', inplace=True, axis=1)
X_fe_7, y_fe_7, X_fe_7_train, X_fe_7_test, y_fe_7_train, y_fe_7_test = split_dataset(hr_fe_7, target, split_ratio, seed)
cv_acc(lr, X_fe_7_train, y_fe_7_train, 10, seed)
print()
lr_run(lr, X_fe_7_train, y_fe_7_train, X_fe_7_test, y_fe_7_test)
# Based on the EDA, the employees can be clustered by Efficiency, based on the Last Evaluation and the Average Monthly Hours, into 4 categories.
def efficiency_cluster(row):
    """Label a row's efficiency from its evaluation bin vs. monthly-hours
    bin: 'very low', 'low', 'high' or 'normal'."""
    evaluation_bin = row['last_evaluation_bin']
    hours_bin = row['average_montly_hours_bin']
    if evaluation_bin == '(0.00, 0.44]' or hours_bin == '(0, 125]':
        return 'very low'
    if evaluation_bin == '(0.44, 0.57]' and hours_bin in ('(125, 131]', '(131, 161]'):
        return 'low'
    if evaluation_bin == '(0.76, 1.00]' and hours_bin in ('(216, 274]', '(274, 287]', '(287, 310]'):
        return 'high'
    return 'normal'
# Derive the efficiency cluster per row and inspect it.
hr_fe['efficiency'] = hr_fe.apply(lambda row: efficiency_cluster(row), axis=1)
hr_fe.efficiency.value_counts()
plt.figure(figsize=(15,5))
sns.scatterplot(x=hr_fe.average_montly_hours,
y=hr_fe.last_evaluation,
hue=hr_fe.efficiency,
palette = sns.color_palette("hls", 4))
plt.tight_layout()
# Encode, drop the raw source columns, and re-evaluate.
hr_fe_8 = hr_fe.copy()
hr_fe_8 = onehot_encode(hr_fe_8)
hr_fe_8.drop('satisfaction_level', inplace=True, axis=1)
hr_fe_8.drop('last_evaluation', inplace=True, axis=1)
hr_fe_8.drop('average_montly_hours', inplace=True, axis=1)
hr_fe_8.drop('number_project', inplace=True, axis=1)
hr_fe_8.drop('time_spend_company', inplace=True, axis=1)
X_fe_8, y_fe_8, X_fe_8_train, X_fe_8_test, y_fe_8_train, y_fe_8_test = split_dataset(hr_fe_8, target, split_ratio, seed)
cv_acc(lr, X_fe_8_train, y_fe_8_train, 10, seed)
print()
lr_run(lr, X_fe_8_train, y_fe_8_train, X_fe_8_test, y_fe_8_test)
# Based on the EDA, the employees can be clustered by Attitude, based on the Last Evaluation and the Satisfaction Level, into 7 categories.
def attitude_cluster(row):
    """Label a row's attitude from its evaluation bin and satisfaction
    bin. Earlier branches take precedence (e.g. a very low evaluation is
    always 'low performance', whatever the satisfaction)."""
    evaluation_bin = row['last_evaluation_bin']
    satisfaction_bin = row['satisfaction_level_bin']
    if evaluation_bin == '(0.00, 0.44]':
        return 'low performance'
    if satisfaction_bin == '(0.92, 1.00]':
        return 'very happy'
    if evaluation_bin == '(0.76, 1.00]' and satisfaction_bin == '(0.71, 0.92]':
        return 'happy and high performance'
    if evaluation_bin == '(0.44, 0.57]' and satisfaction_bin == '(0.35, 0.46]':
        return 'unhappy and low performance'
    if satisfaction_bin == '(0.00, 0.11]':
        return 'very unhappy'
    if satisfaction_bin in ('(0.11, 0.35]', '(0.35, 0.46]'):
        return 'unhappy'
    return 'normal'
# Derive the attitude cluster per row and inspect it.
hr_fe['attitude'] = hr_fe.apply(lambda row: attitude_cluster(row), axis=1)
hr_fe.attitude.value_counts()
plt.figure(figsize=(15,5))
sns.scatterplot(x=hr_fe.satisfaction_level,
y=hr_fe.last_evaluation,
hue=hr_fe.attitude,
palette = sns.color_palette("hls", 7))
plt.tight_layout()
# Encode, drop the raw source columns, and re-evaluate.
hr_fe_9 = hr_fe.copy()
hr_fe_9 = onehot_encode(hr_fe_9)
hr_fe_9.drop('satisfaction_level', inplace=True, axis=1)
hr_fe_9.drop('last_evaluation', inplace=True, axis=1)
hr_fe_9.drop('average_montly_hours', inplace=True, axis=1)
hr_fe_9.drop('number_project', inplace=True, axis=1)
hr_fe_9.drop('time_spend_company', inplace=True, axis=1)
X_fe_9, y_fe_9, X_fe_9_train, X_fe_9_test, y_fe_9_train, y_fe_9_test = split_dataset(hr_fe_9, target, split_ratio, seed)
cv_acc(lr, X_fe_9_train, y_fe_9_train, 10, seed)
print()
lr_run(lr, X_fe_9_train, y_fe_9_train, X_fe_9_test, y_fe_9_test)
# The variables which have been binned are removed from the dataset, and new features are one hot encoded.
# Final engineered dataset: encode everything, drop all raw continuous
# columns that now have binned/clustered counterparts.
hr_fe_encoded = onehot_encode(hr_fe)
hr_fe_encoded.drop('satisfaction_level', inplace=True, axis=1)
hr_fe_encoded.drop('last_evaluation', inplace=True, axis=1)
hr_fe_encoded.drop('average_montly_hours', inplace=True, axis=1)
hr_fe_encoded.drop('number_project', inplace=True, axis=1)
hr_fe_encoded.drop('time_spend_company', inplace=True, axis=1)
df_desc(hr_fe_encoded)
# The dataset resulting from the Feature Engineering phase contains 58 features, with a model reaching an accuracy of 0.964. The Feature Selection phase aims to reduce the number of variables used by the model.
# Evaluate the fully engineered dataset, then search for the smallest
# feature subset that preserves accuracy via RFE.
X_fe_encoded, y_fe_encoded, X_fe_encoded_train, X_fe_encoded_test, y_fe_encoded_train, y_fe_encoded_test = split_dataset(hr_fe_encoded, target, split_ratio, seed)
cv_acc(lr, X_fe_encoded_train, y_fe_encoded_train, 10, seed)
print()
lr_run(lr, X_fe_encoded_train, y_fe_encoded_train, X_fe_encoded_test, y_fe_encoded_test)
plot_roc(lr, X_fe_encoded_test, y_fe_encoded_test)
# Try RFE with every possible number of selected features, recording the
# test accuracy of each candidate subset. (Loop indentation restored —
# it was lost in the notebook export.)
accuracies = pd.DataFrame(columns=['features','accuracy', 'cols'])
print('Iterations:')
for i in range(1, len(X_fe_encoded.columns)+1):
    logreg = LogisticRegression(solver='lbfgs', max_iter=250)
    # n_features_to_select is keyword-only in recent scikit-learn.
    rfe = RFE(logreg, n_features_to_select=i)
    rfe = rfe.fit(X_fe_encoded, y_fe_encoded.values.ravel())
    cols_rfe = list(X_fe_encoded.loc[:, rfe.support_])
    X_rfe_sel = X_fe_encoded_train[cols_rfe]
    X_rfe_test_sel = X_fe_encoded_test[cols_rfe]
    logreg.fit(X_rfe_sel, y_fe_encoded_train.values.ravel())
    acc_test = logreg.score(X_rfe_test_sel, y_fe_encoded_test)
    accuracies.loc[i] = [i, acc_test, cols_rfe]
    print(i, end=' ')
# Line Plot: accuracy as a function of the number of selected features.
plt.figure(figsize=(15,5))
sns.lineplot(x = accuracies['features'],
             y = accuracies['accuracy'],
             color = 'steelblue')
plt.tight_layout()
accuracies.nlargest(10, 'accuracy')
# Silence convergence warnings from the many repeated fits above.
from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
# Refit RFE with the best-scoring feature count and list the selection.
features_rfe = list(hr_fe_encoded)
features_rfe.remove(target)
X_rfe = hr_fe_encoded.loc[:, features_rfe]
y_rfe = hr_fe_encoded.loc[:, target]
logreg = LogisticRegression(solver='lbfgs', max_iter=250)
rfe = RFE(logreg, n_features_to_select=accuracies.nlargest(1,'accuracy').features.values.ravel()[0])
rfe = rfe.fit(X_rfe, y_rfe)
print(sum(rfe.support_),'selected features:')
for i in list(X_rfe.loc[:, rfe.support_]):
    print(i)
# Second analysis: employee-promotion dataset (separate train/test CSVs,
# absolute Windows paths — adjust when running elsewhere).
train = pd.read_csv('C:\\Users\\RIA SHARMA\\Desktop\\data\\data\\train.csv')
test = pd.read_csv('C:\\Users\\RIA SHARMA\\Desktop\\data\\data\\test.csv')
# getting their shapes
print("Shape of train :", train.shape)
print("Shape of test :", test.shape)
train.head()
test.head()
# describing the training set
train.describe(include = 'all')
train.info()
# checking if there is any NULL value in the dataset
train.isnull().any()
test.isnull().sum()
# checkig the no. of Employees Promoted
train['is_promoted'].value_counts()
# finding the %age of people promoted
# Compute the rate from the data rather than hard-coding 4668/54808,
# which silently goes stale if the input file changes.
promoted = train['is_promoted'].mean() * 100
print("Percentage of Promoted Employees is {:.2f}%".format(promoted))
#plotting a scatter plot
plt.hist(train['is_promoted'])
plt.title('plot to show the gap in Promoted and Non-Promoted Employees', fontsize = 30)
plt.xlabel('0 -No Promotion and 1- Promotion', fontsize = 20)
plt.ylabel('count')
plt.show()
# checking the distribution of the avg_training score of the Employees
plt.rcParams['figure.figsize'] = (15, 7)
sns.distplot(train['avg_training_score'], color = 'blue')
plt.title('Distribution of Training Score among the Employees', fontsize = 30)
plt.xlabel('Average Training Score', fontsize = 20)
plt.ylabel('count')
plt.show()
train['awards_won?'].value_counts()
# plotting a donut chart for visualizing each of the recruitment channel's share
# NOTE(review): the size lists below are hard-coded from the
# value_counts() output above — they go stale if the data changes.
size = [53538, 1270]
colors = ['magenta', 'brown']
labels = "Awards Won", "NO Awards Won"
my_circle = plt.Circle((0, 0), 0.7, color = 'white')
plt.rcParams['figure.figsize'] = (9, 9)
plt.pie(size, colors = colors, labels = labels, shadow = True, autopct = '%.2f%%')
plt.title('Showing a Percentage of employees who won awards', fontsize = 30)
p = plt.gcf()
p.gca().add_artist(my_circle)
plt.legend()
plt.show()
train['KPIs_met >80%'].value_counts()
# plotting a pie chart
size = [35517, 19291]
labels = "Not Met KPI > 80%", "Met KPI > 80%"
colors = ['violet', 'grey']
explode = [0, 0.1]
plt.rcParams['figure.figsize'] = (8, 8)
plt.pie(size, labels = labels, colors = colors, explode = explode, shadow = True, autopct = "%.2f%%")
plt.title('A Pie Chart Representing Gap in Employees in terms of KPI', fontsize = 30)
plt.axis('off')
plt.legend()
plt.show()
# checking the distribution of length of service
sns.distplot(train['length_of_service'], color = 'green')
plt.title('Distribution of length of service among the Employees', fontsize = 30)
plt.xlabel('Length of Service in years', fontsize = 15)
plt.ylabel('count')
plt.show()
# Distribution of the previous-year ratings.
train['previous_year_rating'].value_counts().sort_values().plot.bar(color = 'violet', figsize = (15, 7))
plt.title('Distribution of Previous year rating of the Employees', fontsize = 30)
plt.xlabel('Ratings', fontsize = 15)
plt.ylabel('count')
plt.show()
# checking the distribution of age of Employees in the company
sns.distplot(train['age'], color = 'red')
plt.title('Distribution of Age of Employees', fontsize = 30)
plt.xlabel('Age', fontsize = 15)
plt.ylabel('count')
plt.show()
# checking the different no. of training done by the employees
plt.rcParams['figure.figsize'] = (17, 7)
sns.violinplot(train['no_of_trainings'], color = 'purple')
plt.title('No. of trainings done by the Employees', fontsize = 30)
plt.xlabel('No. of Trainings', fontsize = 15)
plt.ylabel('Frequency')
plt.show()
# checking the different types of recruitment channels for the company
train['recruitment_channel'].value_counts()
# plotting a donut chart for visualizing each of the recruitment channel's share
size = [30446, 23220, 1142]
colors = ['yellow', 'red', 'lightgreen']
labels = "Others", "Sourcing", "Reffered"
my_circle = plt.Circle((0, 0), 0.7, color = 'white')
plt.rcParams['figure.figsize'] = (9, 9)
plt.pie(size, colors = colors, labels = labels, shadow = True, autopct = '%.2f%%')
plt.title('Showing share of different Recruitment Channels', fontsize = 30)
p = plt.gcf()
p.gca().add_artist(my_circle)
plt.legend()
plt.show()
# checking the gender gap
train['gender'].value_counts()
# plotting a pie chart
size = [38496, 16312]
labels = "Male", "Female"
colors = ['yellow', 'orange']
explode = [0, 0.1]
plt.rcParams['figure.figsize'] = (8, 8)
plt.pie(size, labels = labels, colors = colors, explode = explode, shadow = True, autopct = "%.2f%%")
plt.title('A Pie Chart Representing GenderGap', fontsize = 30)
plt.axis('off')
plt.legend()
plt.show()
# checking the different regions of the company
plt.rcParams['figure.figsize'] = (20, 10)
sns.countplot(train['region'], color = 'pink')
plt.title('Different Regions in the company', fontsize = 30)
plt.xticks(rotation = 60)
plt.xlabel('Region Code', fontsize = 15)
plt.ylabel('count', fontsize = 15)
plt.show()
# scatter plot between average training score and is_promoted
# Each crosstab row is normalized to proportions before the stacked plot.
data = pd.crosstab(train['avg_training_score'], train['is_promoted'])
data.div(data.sum(1).astype(float), axis = 0).plot(kind = 'bar', stacked = True, figsize = (20, 9), color = ['darkred', 'lightgreen'])
plt.title('Looking at the Dependency of Training Score in promotion', fontsize = 30)
plt.xlabel('Average Training Scores', fontsize = 15)
plt.legend()
plt.show()
# checking dependency of different regions in promotion
data = pd.crosstab(train['region'], train['is_promoted'])
data.div(data.sum(1).astype('float'), axis = 0).plot(kind = 'bar', stacked = True, figsize = (20, 8), color = ['lightblue', 'purple'])
plt.title('Dependency of Regions in determining Promotion of Employees', fontsize = 30)
plt.xlabel('Different Regions of the Company', fontsize = 20)
plt.legend()
plt.show()
# dependency of awards won on promotion
data = pd.crosstab(train['awards_won?'], train['is_promoted'])
data.div(data.sum(1).astype('float'), axis = 0).plot(kind = 'bar', stacked = True, figsize = (10, 8), color = ['magenta', 'purple'])
plt.title('Dependency of Awards in determining Promotion', fontsize = 30)
plt.xlabel('Awards Won or Not', fontsize = 20)
plt.legend()
plt.show()
#dependency of KPIs with Promotion
data = pd.crosstab(train['KPIs_met >80%'], train['is_promoted'])
data.div(data.sum(1).astype('float'), axis = 0).plot(kind = 'bar', stacked = True, figsize = (10, 8), color = ['pink', 'darkred'])
plt.title('Dependency of KPIs in determining Promotion', fontsize = 30)
plt.xlabel('KPIs Met or Not', fontsize = 20)
plt.legend()
plt.show()
# checking dependency on previous years' ratings
data = pd.crosstab(train['previous_year_rating'], train['is_promoted'])
data.div(data.sum(1).astype('float'), axis = 0).plot(kind = 'bar', stacked = True, figsize = (15, 8), color = ['violet', 'pink'])
plt.title('Dependency of Previous year Ratings in determining Promotion', fontsize = 30)
plt.xlabel('Different Ratings', fontsize = 20)
plt.legend()
plt.show()
# checking how length of service determines the promotion of employees
data = pd.crosstab(train['length_of_service'], train['is_promoted'])
data.div(data.sum(1).astype('float'), axis = 0).plot(kind = 'bar', stacked = True, figsize = (20, 8), color = ['pink', 'lightblue'])
plt.title('Dependency of Length of service in Promotions of Employees', fontsize = 30)
plt.xlabel('Length of service of employees', fontsize = 20)
plt.legend()
plt.show()
# checking dependency of age factor in promotion of employees
data = pd.crosstab(train['age'], train['is_promoted'])
data.div(data.sum(1).astype('float'), axis = 0).plot(kind = 'bar', stacked = True, figsize = (20, 8), color = ['lightblue', 'green'])
plt.title('Dependency of Age in determining Promotion of Employees', fontsize = 30)
plt.xlabel('Age of Employees', fontsize = 20)
plt.legend()
plt.show()
# checking which department got most number of promotions
data = pd.crosstab(train['department'], train['is_promoted'])
data.div(data.sum(1).astype('float'), axis = 0).plot(kind = 'bar', stacked = True, figsize = (20, 8), color = ['orange', 'lightgreen'])
plt.title('Dependency of Departments in determining Promotion of Employees', fontsize = 30)
plt.xlabel('Different Departments of the Company', fontsize = 20)
plt.legend()
plt.show()
# checking dependency of gender over promotion
data = pd.crosstab(train['gender'], train['is_promoted'])
data.div(data.sum(1).astype('float'), axis = 0).plot(kind = 'bar', stacked = True, figsize = (7, 5), color = ['pink', 'yellow'])
plt.title('Dependency of Genders in determining Promotion of Employees', fontsize = 30)
plt.xlabel('Gender', fontsize = 20)
plt.legend()
plt.show()
# filling missing values
# Impute education with the mode; missing previous-year ratings become 1
# (presumably "new employee, no prior rating" — TODO confirm).
train['education'].fillna(train['education'].mode()[0], inplace = True)
train['previous_year_rating'].fillna(1, inplace = True)
# again checking if there is any Null value left in the data
train.isnull().sum().sum()
# filling missing values
test['education'].fillna(test['education'].mode()[0], inplace = True)
test['previous_year_rating'].fillna(1, inplace = True)
# again checking if there is any Null value left in the data
test.isnull().sum().sum()
# removing the employee_id column
train = train.drop(['employee_id'], axis = 1)
train.columns
# saving the employee_id
emp_id = test['employee_id']
# removing the employee_id column
test = test.drop(['employee_id'], axis = 1)
test.columns
# defining the test set
x_test = test
x_test.columns
# one hot encoding for the test set
x_test = pd.get_dummies(x_test)
x_test.columns
# splitting the train set into dependent and independent sets
# NOTE(review): this assumes is_promoted is the last column of train —
# verify against the CSV layout.
x = train.iloc[:, :-1]
y = train.iloc[:, -1]
print("Shape of x:", x.shape)
print("Shape of y:", y.shape)
# one hot encoding for the train set
x = pd.get_dummies(x)
x.columns